library(rsample)
library(tidyverse)

# Setup ----

# Point the working directory at this script's folder so the relative
# "data/..." paths below resolve. The lookup relies on the RStudio API, which
# errors when the script is run outside RStudio (e.g. via Rscript) and returns
# an empty path when the active document has never been saved. In those cases
# the working directory is left untouched and is expected to already be the
# script's folder.
local({
  if (requireNamespace("rstudioapi", quietly = TRUE) &&
        rstudioapi::isAvailable()) {
    script_path <- rstudioapi::getActiveDocumentContext()$path
    if (nzchar(script_path)) {
      setwd(dirname(script_path))
    }
  }
})

# Fix the RNG seed so the random splits further down are reproducible.
set.seed(65)

# Load the analysis-ready Florida file (read_csv returns a tibble).
fldata <- read_csv("data/fl_ready_for_analysis.csv")


# Data cleaning ----

# Recode party as a factor on the raw data.
fldata$party <- factor(fldata$party)

# Build the modelling table in one pass:
#   1. round the race predictions to two decimals, donations per capita to a
#      whole number, and median house price to the nearest 10,000
#      (digits = -4);
#   2. keep only the outcome (sr.race) and the predictors we want to use;
#   3. drop every row with a missing value (the random forest package does not
#      accept missing data);
#   4. treat a self-reported race of "Unknown" as missing and drop it too.
fldata_rf <- fldata %>%
  mutate(
    pred.whi = round(pred.whi, 2),
    pred.bla = round(pred.bla, 2),
    pred.lat = round(pred.lat, 2),
    pred.asi = round(pred.asi, 2),
    pred.oth = round(pred.oth, 2),
    donations.per.cap = round(donations.per.cap, 0),
    houseprice_rounded = round(median.house.price, -4)
  ) %>%
  select(
    sr.race, party, sex, age,
    pred.whi, pred.bla, pred.lat, pred.asi, pred.oth,
    income_rounded, college_rounded, pred.vote,
    zip.population, donations.per.cap, donor,
    cvap.pct.white, cvap.pct.asian, cvap.pct.black, cvap.pct.latino,
    homeowner_rounded, houseprice_rounded
  ) %>%
  na.omit() %>%
  filter(sr.race != "Unknown")

# Make the outcome a factor. This runs after the filter, so "Unknown" is not
# kept as a level.
fldata_rf$sr.race <- factor(fldata_rf$sr.race)


# Split the data into three sets: 60% train / 20% validation / 20% test ----

# First cut: 60% of the rows go to training, the remaining 40% are held out.
fl_60 <- initial_split(fldata_rf, prop = 0.6)
fl_train <- training(fl_60)
fl_test40 <- testing(fl_60)

# Second cut: halve the held-out 40% into validation and test sets.
fl_20 <- initial_split(fl_test40, prop = 0.5)
rm(fl_test40)  # only needed as input to the second split
fl_valid <- training(fl_20)
fl_test <- testing(fl_20)

# Save each partition under data/.
# NOTE(review): write.csv defaults to row.names = TRUE, so each file gets a
# leading index column -- confirm downstream readers expect that.
write.csv(fl_train, file = "data/FL_train_60.csv")
write.csv(fl_valid, file = "data/FL_validate_20.csv")
write.csv(fl_test, file = "data/FL_test_20.csv")



